TODO

  • examine use of na.omit, authors_include_bh, and contents of clinically_relevant
  • NOTE SOMEWHERE THAT N IN EACH CELL IN THE TABLE IS TOTAL N FOR THE STUDY, EVEN FOR THE INDEPENDENT T TESTS
# dependencies
library(tidyverse)
library(effectsize)
library(pwr)
library(irr)
library(metafor)
library(forestploter)
library(janitor)
library(greekLetters)
library(knitr)
library(kableExtra)

dir.create("plots")

# function to round all numeric vars in a data frame
round_df <- function(df, n_digits = 3) {
  df %>% mutate_if(is.numeric, janitor::round_half_up, digits = n_digits)
}

# apa format p value -----
apa_p_value <- function(p){
  p_formatted <- ifelse(p >= 0.001, paste("=", round_half_up(p, 3)),
                        ifelse(p < 0.001, "< .001", NA))
  p_formatted <- gsub(pattern = "0.", replacement = ".", x = p_formatted, fixed = TRUE)
  p_formatted
}

# meta results string
meta_effect_string <- function(fit, predictions){
  paste0("k = ", fit$k, ", r = ", predictions$pred,
         ", 95% CI [", predictions$ci.lb, ", ", predictions$ci.ub, "]",
         ", 95% CR [", predictions$cr.lb, ", ", predictions$cr.ub, "]",
         ", 95% PI [", predictions$pi.lb, ", ", predictions$pi.ub, "]",
         ", p = ", signif(2*pnorm(-abs(fit$zval)), digits = 1))  # exact p value from z score
}

# notation off
options(scipen = 999)

# plot theme
custom_theme <- 
  forest_theme(base_size    = 10,
               ci_lty       = 1,
               ci_lwd       = 1.5,
               ci_Theight   = 0.2,
               summary_fill = "black",
               summary_col  = "black",
               footnote_cex = 0.8)

# get data
## Vahey et al.'s effect sizes
data_vahey <- read.csv("../data/data_vahey_et_al_2015.csv")


## get effect sizes reextracted from the original articles
data_reextracted <- read.csv("../data/data_reextracted_and_annotated.csv") %>%
  # remove effects that merely repeat what vahey found, which are replaced by other rows that do new extractions for a corrispondonding effect. 
  filter(replaced_effect == FALSE) %>%
  # mark each effect as clinically relevant only if both raters scored it as such
  rowwise() %>%
  mutate(clinically_relevant = 
           as.logical(max(c(clinically_relevant_criterion_rater_1,
                            clinically_relevant_criterion_rater_2)))) %>%
  ungroup()

### remove non-criterion effects
data_reextracted_criterion <- data_reextracted %>%
  filter(criterion_effect == TRUE) 

### remove non-clinical criterion variables
data_reextracted_criterion_clinically_relevant <- data_reextracted_criterion %>%
  filter(clinically_relevant == TRUE) 


## sample sizes in published IRAP research, taken from a separate project
data_sample_sizes <- read_csv("../data/sample size data/data_irap.csv") 

data_sample_sizes_vector <- data_sample_sizes |>
  drop_na(n_participants_after_exclusions) |>
  pull(n_participants_after_exclusions)

Reproducibility of original analylses

Power analyses

Calculated from meta analytic effect sizes reported in forest plot and text.

Vahey et al.’s reported meta effect size estimate was r = .45, 95% CI [.40, .54], 95% CR [.23, .67]. Note that this is quite a large effect size, equivalent to Cohen’s d = 1.01.

Using this effect size, they conducted power analyses.

  • They reported that: “based on the current meta-effect [r = .45], a sample size of 29 participants would provide a study with a statistical power of .80 when examining the statistical significance of first-order Pearson’s r correlations between clinically-focused IRAP effects and corresponding criterion variables” (p.63); and
  • “Adopting a conservative approach in favour of controlling for overly optimistic publication biases, the most recent recommendation is to calculate sample size requirements not in terms of a given meta-effect, but rather in terms of the lower bound of its associated confidence interval (Perugini, Gallucci, & Costantini, 2014). Given that we obtained a confidence interval of (.40, .54) around the present meta-effect, Perugini et al.’s approach implies that a sample size of at least N = 37 would be required in order to achieve a statistical power of .80 when testing a continuous first-order correlation between a clinically-focused IRAP effect and a given criterion variable” (p.63).

I used the R packages pwr and effectsize to attempt to reproduce these values.

data_power_verified <- 
  tibble(
    test = c("Pearson’s r",
             "Pearson’s r",
             "Pearson’s r",
             "Pearson’s r",
             "Independent t-test (Cohen's d)",
             "Independent t-test (Cohen's d)",
             "Dependent t-test (Cohen's d)",
             "Dependent t-test (Cohen's d)"),
    tails = c("One-tailed",
              "One-tailed",
              "Two-tailed",
              "Two-tailed",
              "One-tailed",
              "One-tailed",
              "One-tailed",
              "One-tailed"),
    estimate = c("Point estimate",
                 "Lower bound of 95% Confidence Interval",
                 "Point estimate",
                 "Lower bound of 95% Confidence Interval",
                 "Point estimate",
                 "Lower bound of 95% Confidence Interval",
                 "Point estimate",
                 "Lower bound of 95% Confidence Interval"),
    effect_size_vahey = c(0.45, 0.4, 0.45, 0.4, 1.01, 0.87, 1.01, 0.87),
    required_n_vahey = c(29L, 37L, 36L, NA, 26L, 36L, 8L, 10L)
  )

point_estimate_vahey <- .45
lower_ci_vahey <- .40

data_power_verified$required_n_verified[1] <- 
  pwr.r.test(n = NULL, 
             r = point_estimate_vahey, 
             sig.level = 0.05, 
             power = 0.8, 
             alternative = "greater")$n %>%
  ceiling()

data_power_verified$required_n_verified[2] <- 
  pwr.r.test(n = NULL, 
             r = lower_ci_vahey, 
             sig.level = 0.05, 
             power = 0.8, 
             alternative = "greater")$n %>%
  ceiling()

data_power_verified$required_n_verified[3] <- 
  pwr.r.test(n = NULL, 
             r = point_estimate_vahey, 
             sig.level = 0.05, 
             power = 0.8, 
             alternative = "two.sided")$n %>%
  ceiling()

data_power_verified$required_n_verified[4] <- 
  pwr.r.test(n = NULL, 
             r = lower_ci_vahey, 
             sig.level = 0.05, 
             power = 0.8, 
             alternative = "two.sided")$n %>%
  ceiling()

data_power_verified$required_n_verified[5] <- 
  pwr.t.test(n = NULL, 
             d = r_to_d(point_estimate_vahey), 
             type = "two.sample",
             sig.level = 0.05, 
             power = 0.8, 
             alternative = "greater")$n %>%
  ceiling()*2

data_power_verified$required_n_verified[6] <- 
  pwr.t.test(n = NULL, 
             d = r_to_d(lower_ci_vahey), 
             type = "two.sample",
             sig.level = 0.05, 
             power = 0.8, 
             alternative = "greater")$n %>%
  ceiling()*2

data_power_verified$required_n_verified[7] <- 
  pwr.t.test(n = NULL, 
             d = r_to_d(point_estimate_vahey), 
             type = "paired",
             sig.level = 0.05, 
             power = 0.8, 
             alternative = "greater")$n %>%
  ceiling()

data_power_verified$required_n_verified[8] <- 
  pwr.t.test(n = NULL, 
             d = r_to_d(lower_ci_vahey), 
             type = "paired",
             sig.level = 0.05, 
             power = 0.8, 
             alternative = "greater")$n %>%
  ceiling()

data_power_verified %>%
  kable() %>%
  kable_classic(full_width = FALSE)
test tails estimate effect_size_vahey required_n_vahey required_n_verified
Pearson’s r One-tailed Point estimate 0.45 29 29
Pearson’s r One-tailed Lower bound of 95% Confidence Interval 0.40 37 37
Pearson’s r Two-tailed Point estimate 0.45 36 36
Pearson’s r Two-tailed Lower bound of 95% Confidence Interval 0.40 NA 46
Independent t-test (Cohen’s d) One-tailed Point estimate 1.01 26 26
Independent t-test (Cohen’s d) One-tailed Lower bound of 95% Confidence Interval 0.87 36 34
Dependent t-test (Cohen’s d) One-tailed Point estimate 1.01 8 8
Dependent t-test (Cohen’s d) One-tailed Lower bound of 95% Confidence Interval 0.87 10 10

Meta-analytic effect size and intervals

Calculated from weighted average effect sizes reported in forest plot.

The power analyses relied on the accuracy of the meta-analytic effect size. So, I then attempted to computationally reproduce the meta-analytic effect size from the effect sizes, confidence intervals, credibility intervals, and sample sizes reported in Vahey et al.’s forest plot and their description of their meta-analytic method in their manuscript.

In an email, Vahey stated that they performed a Hunter & Schmidt meta analysis, following the guide of Field & Gillett (2010) and using their code.

On inspection, Field & Gillett (2010) make a distinction between the Hunter & Schmidt method, which is distinctive for reporting credibility intervals rather than confidence intervals, and the Hedges and colleagues’ method, which is distinctive for using Fisher’s r-to-z transformations.

Vahey et al. (2015) do not report applying any transformations to their data. However, Vahey et al.’s (2015) individual effect sizes in their forest plot have asymmetric confidence intervals. This implies that a transformation was performed. It is therefore possible that Vahey et al. combined the two methods (and code) provided by Field & Gillett (2010). I first fit a Hunter & Schmidt method, then a combined Hunter & Schmidt and Hedges’ and colleagues method.

Hunter & Schmidt method using metafor R package

Information about implementation of Hunter & Schmidt-style meta-analysis in metafor from here.

total_n <- data_vahey %>%
  distinct(article, n_forest_plot) %>%
  drop_na() %>%
  summarize(total_n = sum(n_forest_plot, na.rm = TRUE)) %>%
  pull(total_n)

# calculate variances
data_recalculated_variance <- 
  escalc(measure = "COR", 
         ri      = mean_r_forest_plot_numeric, 
         ni      = n_forest_plot, 
         data    = data_vahey, 
         vtype   = "AV",
         digits  = 10) %>%
  drop_na() %>%
  select(article, article_abbreviated,
         yi, vi, ni = n_forest_plot) %>%
  mutate(lower = yi - sqrt(vi)*1.96,
         upper = yi + sqrt(vi)*1.96)

# fit model
fit_reproduced <- 
  rma(yi      = yi, 
      vi      = vi, 
      weights = ni, # Hunter Schmidt method requires weighting by N
      method  = "HS", # Hunter Schmidt method
      data    = data_recalculated_variance,
      slab    = article_abbreviated,
      digits  = 10)

predictions_reproduced <- 
  predict(fit_reproduced) %>%
  as.data.frame() %>%
  select(-se) %>%
  # cr following Field & Gillett (2010) equation 5, where s_r^2 == tau2 as they both estimate the variance in population correlations. e.g., see Field's script "h_s syntax.sps" which refers to this as both tau and s_r^2 depending on the meta analytic approach
  mutate(cr.lb = pred - 1.96*sqrt(fit_reproduced$tau2),
         cr.ub = pred + 1.96*sqrt(fit_reproduced$tau2))

predictions_reproduced_printing <- predictions_reproduced %>%
  round_df(2)

# predictions_reproduced %>%
#   round_df(2) %>%
#   kable() %>%
#   kable_classic(full_width = FALSE)

# plot
data_forest <- data_recalculated_variance %>%
  drop_na() %>%
  select(article_abbreviated,
         n = ni,
         r = yi,
         lower,
         upper) %>%
  # bind_rows(tibble(article_abbreviated = "Meta",
  #                  n = 35, # arbitrary number to make the diamond an appropriate size
  #                  r = predictions_reproduced$pred,
  #                  lower = predictions_reproduced$pi.lb,
  #                  upper = predictions_reproduced$pi.ub)) %>%
  bind_rows(tibble(article_abbreviated = c("Meta (confidence interval)", 
                                           "Meta (credibility interval)"),
                   n = 35, # arbitrary number to make the diamond an appropriate size
                   r = c(predictions_reproduced$pred, 
                         predictions_reproduced$pred),
                   lower = c(predictions_reproduced$ci.lb,
                             predictions_reproduced$cr.lb),
                   upper = c(predictions_reproduced$ci.ub,
                             predictions_reproduced$cr.ub))) %>%
  mutate(size = n/sum(n),
         n = ifelse(str_detect(article_abbreviated, "Meta"), total_n, n),
         article_abbreviated = fct_relevel(article_abbreviated,
                                           "Meta (confidence interval)", 
                                           "Meta (credibility interval)")) %>%
  mutate(` ` = paste(rep(" ", 30), collapse = " ")) %>%
  select(Article = article_abbreviated, ` `, r, lower, upper, n, size) %>%
  round_df(2)

p_reproduced <- 
  forestploter::forest(data       = select(data_forest, -size),
                       est        = data_forest$r,
                       lower      = data_forest$lower, 
                       upper      = data_forest$upper,
                       sizes      = 1/data_forest$size,
                       #is_summary = c(rep(FALSE, nrow(data_forest)-1), TRUE),
                       is_summary = c(rep(FALSE, nrow(data_forest)-2), TRUE, TRUE),
                       ci_column  = 2,
                       ref_line   = 0,
                       theme      = custom_theme,
                       xlim       = c(-0.3, 1.1),
                       ticks_at   = c(-.2, 0, .2, .4, .6, .8, 1.0))

p_reproduced

ggplot2::ggsave(filename = "plots/forest plot Hunter & Schmidt method.pdf", 
                plot = p_reproduced,
                device = "pdf",
                width = 7.5, 
                height = 4.5, 
                units = "in")

Meta effect: k = 15, r = 0.47, 95% CI [0.4, 0.54], 95% CR [0.47, 0.47], 95% PI [0.4, 0.54], p = 0.000000000000000000000000000000000000004.

Hunter & Schmidt method with Hedges’ and colleagues transformations using metafor R package

Fisher’s r-to-z transformation and back transformation, with HS estimator. No Overton (1998) transformation.

# calculate variances
data_recalculated_variance_transformed <- data_vahey %>%
  # # Overton transformation. Unnumbered equation between equations 8 and 9 in Field & Gillett (2010)
  # mutate(mean_r_forest_plot_numeric = mean_r_forest_plot_numeric - (mean_r_forest_plot_numeric*(1-mean_r_forest_plot_numeric^2))/(2*(n_forest_plot-3))) %>% 
  escalc(measure = "ZCOR", 
         ri      = mean_r_forest_plot_numeric, 
         ni      = n_forest_plot, 
         data    = ., 
         vtype   = "AV",
         digits  = 10) %>%
  drop_na() %>%
  select(article, article_abbreviated,
         yi, vi, ni = n_forest_plot) %>%
  mutate(lower = yi - sqrt(vi)*1.96,
         upper = yi + sqrt(vi)*1.96)

# fit model
fit_reproduced_transformed <- 
  rma(yi      = yi, 
      vi      = vi, 
      weights = ni, # Hunter Schmidt method requires weighting by N
      method  = "HS", # Hunter Schmidt method
      data    = data_recalculated_variance_transformed,
      slab    = article_abbreviated,
      digits  = 10)

predictions_reproduced_transformed <- 
  predict(fit_reproduced_transformed) %>%
  as.data.frame() %>%
  select(-se) %>%
  # cr following Field & Gillett (2010) equation 5, where s_r^2 == tau2 as they both estimate the variance in population correlations. e.g., see Field's script "h_s syntax.sps" which refers to this as both tau and s_r^2 depending on the meta analytic approach
  mutate(cr.lb = pred - 1.96*sqrt(fit_reproduced$tau2),
         cr.ub = pred + 1.96*sqrt(fit_reproduced$tau2))

predictions_reproduced_transformed_backtransformed <-
  predictions_reproduced_transformed %>%
  mutate_all(transf.ztor) %>%
  round_df(2)

# plot
data_forest_transformed <- data_recalculated_variance_transformed %>%
  select(article_abbreviated,
         n = ni,
         r = yi,
         lower,
         upper) %>%
  # bind_rows(tibble(article_abbreviated = "Meta",
  #                  n = 35, # arbitrary number to make the diamond an appropriate size
  #                  r = predictions_reproduced_transformed$pred,
  #                  lower = predictions_reproduced_transformed$pi.lb,
  #                  upper = predictions_reproduced_transformed$pi.ub)) %>%
  bind_rows(tibble(article_abbreviated = c("Meta (confidence interval)", 
                                           "Meta (credibility interval)"),
                   n = 35, # arbitrary number to make the diamond an appropriate size
                   r = c(predictions_reproduced_transformed$pred,
                         predictions_reproduced_transformed$pred),
                   lower = c(predictions_reproduced_transformed$ci.lb,
                             predictions_reproduced_transformed$cr.lb),
                   upper = c(predictions_reproduced_transformed$ci.ub,
                             predictions_reproduced_transformed$cr.ub))) %>%
  mutate(r = transf.ztor(r),
         lower = transf.ztor(lower),
         upper = transf.ztor(upper)) %>%
  mutate(size = n/sum(n),
         n = ifelse(str_detect(article_abbreviated, "Meta"), total_n, n),
         article_abbreviated = fct_relevel(article_abbreviated,
                                           "Meta (confidence interval)", 
                                           "Meta (credibility interval)")) %>%
  mutate(` ` = paste(rep(" ", 30), collapse = " ")) %>%
  select(Article = article_abbreviated, ` `, r, lower, upper, n, size) %>%
  round_df(2)

p_reproduced_transformed <- 
  forestploter::forest(data       = select(data_forest_transformed, -size),
                       est        = data_forest_transformed$r,
                       lower      = data_forest_transformed$lower, 
                       upper      = data_forest_transformed$upper,
                       sizes      = 1/data_forest_transformed$size,
                       #is_summary = c(rep(FALSE, nrow(data_forest)-1), TRUE),
                       is_summary = c(rep(FALSE, nrow(data_forest)-2), TRUE, TRUE),
                       ci_column  = 2,
                       ref_line   = 0,
                       theme      = custom_theme,
                       xlim       = c(-0.3, 1.1),
                       ticks_at   = c(-.2, 0, .2, .4, .6, .8, 1.0))

p_reproduced_transformed

ggplot2::ggsave(filename = "plots/forest plot Hunter & Schmidt method with Fisher's r-to-z transformations.pdf", 
                plot = p_reproduced_transformed,
                device = "pdf",
                width = 7.5, 
                height = 4.5, 
                units = "in")

Meta effect: k = 15, r = 0.47, 95% CI [0.4, 0.54], 95% CR [0.47, 0.47], 95% PI [0.4, 0.54], p = 0.000000000000000000000000002.

Average effect sizes

Reported in forest plot, calculated from individual effect sizes in supplementary materials.

Next, I attempted to reproduce the weighted-mean effect sizes reported in Vahey et al.’s forest plot from the individual effect sizes they reported in their supplementary materials.

Vahey et al. reported that they weighted by degrees of freedom, and I do the same.

I noted that some of the confidence intervals in Vahey et al.’s forest plot are asymmetrical around the point estimate. This is uncommon and not accounted for by Vahey et al. detailing of how they calculated the effect sizes and their confidence intervals. However, I take them at face value as they are the most detailed data available to work from.

# recalculate and compare
data_weighted_effect_sizes <- data_vahey %>%
  select(article = article_abbreviated, 
         r_supplementary, 
         df_supplementary) %>%
  group_by(article) %>%
  summarize(r_recalculated_weighted_mean = round_half_up(weighted.mean(x = r_supplementary, w = df_supplementary), 2)) %>%
  ungroup() %>%
  left_join(data_vahey %>% 
              select(article = article_abbreviated, 
                     mean_r_forest_plot_numeric) %>%
              drop_na(),
            by = "article") %>%
  mutate(congruence = ifelse(janitor::round_half_up(r_recalculated_weighted_mean, 2) == janitor::round_half_up(mean_r_forest_plot_numeric, 2), "Congruent", "Incongruent"),
         congruence = fct_relevel(congruence, "Congruent", "Incongruent"),
         difference = r_recalculated_weighted_mean - mean_r_forest_plot_numeric)

# plot
p_comparison_reaveraged <- 
  ggplot(data_weighted_effect_sizes, aes(mean_r_forest_plot_numeric, r_recalculated_weighted_mean)) +
  geom_abline(slope = 1, linetype = "dotted") +
  geom_point(aes(color = congruence, shape = congruence), size = 2.25) +
  theme_classic() +
  scale_color_viridis_d(begin = 0.25, end = 0.75) + 
  theme(legend.position = "top", # c(0.25, 0.85),
        legend.title = element_blank()) + 
  #legend.box.background = element_rect(colour = "black")) +
  xlim(0.35, 0.75) +
  ylim(0.35, 0.75) +
  xlab("Reported in Vahey et al.'s (2015)\nforest plot") +
  ylab("Recalculated from Vahey et al.'s (2015)\nsupplementary materials") +
  annotate("text", x = .66, y = .45, size = 3, label = "Overestimates validity") +
  annotate("text", x = .45, y = .66, size = 3, label = "Underestimates validity")

p_comparison_reaveraged

ggplot2::ggsave(filename = "plots/scatter plot comparing effect sizes in forest plot to reaveraged from supplementary materials.pdf",
                plot = p_comparison_reaveraged,
                device = "pdf",
                width = 4.5,
                height = 4.5,
                units = "in")

# table
data_weighted_effect_sizes %>%
  summarize(n_incongruent = sum(ifelse(congruence == "Incongruent", TRUE, FALSE)),
            percent_incongruent = round_half_up(mean(ifelse(congruence == "Incongruent", TRUE, FALSE)), 2)*100) %>%
  kable() %>%
  kable_classic(full_width = FALSE)
n_incongruent percent_incongruent
2 13
data_weighted_effect_sizes %>%
  filter(difference != 0) %>%
  select(article, r_recalculated_weighted_mean, mean_r_forest_plot_numeric, difference, congruence) %>%
  round_df(2) %>%
  kable() %>%
  kable_classic(full_width = FALSE)
article r_recalculated_weighted_mean mean_r_forest_plot_numeric difference congruence
Nicholson, Dempsey et al. (2013) 0.46 0.48 -0.02 Incongruent
Vahey et al. (2009) 0.58 0.53 0.05 Incongruent

The weighted effect sizes reported in Vahey et al.’s forest plot could not be computationally reproduced from the individual effect sizes and degrees of freedom reported in their supplementary materials.

Many values were congruent, but a minority differed (k = 2 [13%]). Table includes those that differed.

Individual effect sizes

Assessment of incorrect inclusion

Laken’s (2016) describes incorrect inclusions as inclusion of effect sizes that do not meet the inclusion criteria. Vahey et al. (2015) stated that the purpose of their meta-analysis was to “quantify how much IRAP effects from clinically-relevant responding co-vary with corresponding clinically-relevant criterion variables” (p.60). Their inclusion criterion was that “the IRAP and criterion variables must have been deemed to target some aspect of a condition included in a major psychiatric diagnostic scheme such as the Diagnostic and Statistical Manual of Mental Disorders (DSM-5, 2013) … The authors decided whether the responses measured by a given IRAP trial-type should co-vary with a specific criterion variable by consulting the relevant empirical literature.” (p.60). References to neither the specific clinical condition that was targeted by the IRAP and the criterion variable nor the specific empirical literature that Vahey et al. (2015) used to justify the inclusion of each criterion were provided in the original article or supplementary materials. Nonetheless, Vahey’s own inclusion criterion required that effects referred to covariation between an IRAP and an external clinically-relevant criterion variable, consistent with the APA dictionary of psychology definition of criterion validity (REF).

data_vahey %>%
  count(involves_external_criterion) %>%
  kable() %>%
  kable_classic(full_width = FALSE)
involves_external_criterion n
FALSE 23
TRUE 33

Assessment of incorrect exclusion

“Collectively, these 15 articles yielded 56 statistical effects between various clinically focused IRAP effects and their respective criterion variables. … of the 56 statistical effects only 8 (i.e. 14%) were not initially cited by both authors for inclusion.” (Vahey et al., 2015, p. 60)

Vahey et al. did not report how many effect sizes they excluded.

I reviewed these same 15 articles and followed Vahey et al.’s (2015) same inclusion criteria (i.e., like Vahey, I did not limit myself to effect sizes reported in the articles but also considered ones implied within the articles. Like Vahey et al., where necessary I contacted the original authors to obtain additional information about effect sizes).

I extracted 308 effect sizes. Of these, 255 met the inclusion criterion of being a criterion effect. Of these, 171 met the inclusion criterion of being clincially relevant by Vahey et al.’s definition. This was 3.1 times the number extracted by Vahey et al. 

Inter rater reliability

data_raters <- data_reextracted_criterion %>%
  select(clinically_relevant_criterion_rater_1, clinically_relevant_criterion_rater_2)

data_raters %>%
  mutate(agreement = ifelse(clinically_relevant_criterion_rater_1 == clinically_relevant_criterion_rater_2, TRUE, FALSE)) %>%
  summarize(interrater_percent_agreement = round_half_up(mean(agreement, na.rm = TRUE), 1)) %>%
  kable() %>%
  kable_classic(full_width = FALSE)
interrater_percent_agreement
0.9
kappa2(data_raters, "unweighted")
##  Cohen's Kappa for 2 Raters (Weights: unweighted)
## 
##  Subjects = 255 
##    Raters = 2 
##     Kappa = 0.872 
## 
##         z = 14 
##   p-value = 0

Reanalyses

Meta-analysis including all effects meeting Vahey et al.’s own inclusion and exclusion criteria

Excluding non-clinically relevant and effects that do not involve a relationship with an external variable, which cannot provide evidence of criterion validity.

Details of the meta-analytic strategy, which used contemporary methods:

  • Weighting by inverse variance rather than sample size, as sample size provides a poorer proxy of measurement error. This is now more standard practice in meta-analysis.
  • Fishers r-to-z transformations to deal with ceiling effects on confidence intervals.
  • REML estimator rather than Hunter & Schmidt.
  • Multilevel meta-analysis model rather than weighted-averaging of effect sizes (random intercept for source study).
total_n_new <- data_reextracted_criterion_clinically_relevant %>%
  distinct(article, n_from_paper) %>%
  group_by(article) %>%
  filter(n_from_paper == max(n_from_paper)) %>%
  ungroup() %>%
  summarize(total_n = sum(n_from_paper, na.rm = TRUE)) %>%
  pull(total_n)

# exclusions and transformations
data_for_meta_new <- data_reextracted_criterion_clinically_relevant %>%
  mutate(variables = paste(irap_variable, criterion_variable, sep = "-")) %>%
  dplyr::select(article, variables, r_from_paper, n_from_paper, authors_include_bh) %>%
  na.omit() %>%
  escalc(measure = "ZCOR", 
         ri = r_from_paper, 
         ni = n_from_paper,
         data = ., 
         slab = paste(article, variables), 
         digits = 10) %>%
  rename(ni = n_from_paper) %>%
  dplyr::select(article, variables, yi, vi, ni, authors_include_bh) %>%
  na.omit() %>%
  group_by(article) %>%
  mutate(es_number = row_number(),
         article_abbreviated = paste(article, es_number)) %>%
  ungroup() %>%
  mutate(lower = yi - sqrt(vi)*1.96,
         upper = yi + sqrt(vi)*1.96)

# fit 
fit_new <- 
  rma.mv(yi     = yi, 
         V      = vi, 
         random = ~ 1 | article, 
         method = "REML", 
         data   = data_for_meta_new,
         slab   = article_abbreviated,
         digits = 10)

predictions_new <- 
  predict(fit_new) %>%
  as.data.frame() %>%
  select(-se) %>%
  # cr following Field & Gillett (2010) equation 5
  mutate(cr.lb = pred - 1.96*sqrt(fit_new$tau2),
         cr.ub = pred + 1.96*sqrt(fit_new$tau2))

predictions_new_backtransformed <- predictions_new %>%
  mutate_all(transf.ztor) %>%
  round_df(2)

# plot
data_forest_new <- data_for_meta_new %>%
  drop_na() %>%
  select(article_abbreviated,
         n = ni,
         r = yi,
         lower,
         upper) %>%
  # bind_rows(tibble(article_abbreviated = "Meta",
  #                  n = 35, # arbitrary number to make the diamond an appropriate size
  #                  r = predictions_new$pred,
  #                  lower = predictions_new$ci.lb,
  #                  upper = predictions_new$ci.ub)) %>%
  bind_rows(tibble(article_abbreviated = c("Meta-analysis (3-level RE confidence interval)", 
                                           "Meta-analysis (3-level RE credibility interval)",
                                           "Meta-analysis (3-level RE prediction interval)"),
                   n = 35, # arbitrary number to make the diamond an appropriate size
                   r = c(predictions_new$pred, 
                         predictions_new$pred, 
                         predictions_new$pred),
                   lower = c(predictions_new$ci.lb, 
                             predictions_new$cr.lb, 
                             predictions_new$pi.lb),
                   upper = c(predictions_new$ci.ub, 
                             predictions_new$cr.ub, 
                             predictions_new$pi.ub))) %>%
  mutate(r = transf.ztor(r),
         lower = transf.ztor(lower),
         upper = transf.ztor(upper)) %>%
  mutate(size = n/sum(n),
         n = ifelse(str_detect(article_abbreviated, "Meta"), total_n_new, n),
         article_abbreviated = fct_relevel(article_abbreviated,
                                           "Meta-analysis (3-level RE confidence interval)", 
                                           "Meta-analysis (3-level RE credibility interval)",
                                           "Meta-analysis (3-level RE prediction interval)")) %>%
  mutate(` ` = paste(rep(" ", 35), collapse = " ")) %>%
  select(Article = article_abbreviated, ` `, r, lower, upper, n, size) %>%
  mutate(r     = round_half_up(r, 2),
         lower = round_half_up(lower, 2),
         upper = round_half_up(upper, 2))

p_new <- 
  forestploter::forest(data       = select(data_forest_new, -size),
                       est        = data_forest_new$r,
                       lower      = data_forest_new$lower, 
                       upper      = data_forest_new$upper,
                       sizes      = 1/data_forest_new$size,
                       # is_summary = c(rep(FALSE, nrow(data_forest_new)-1), TRUE),
                       is_summary = c(rep(FALSE, nrow(data_forest_new)-3), 
                                      rep(TRUE, 3)),
                       ci_column  = 2,
                       ref_line   = 0,
                       theme      = custom_theme,
                       xlim       = c(-0.7, 1.1),
                       ticks_at   = c(-.6, -.4, -.2, 0, .2, .4, .6, .8, 1.0))

p_new

ggplot2::ggsave(filename = "plots/forest plot new multilevel model.pdf", 
                plot = p_new,
                device = "pdf",
                width = 8, 
                height = 39, 
                units = "in")

Meta effect: k = 171, r = 0.22, 95% CI [0.15, 0.29], 95% CR [0.22, 0.22], 95% PI [-0.01, 0.42], p = 0.000000004.

Power analyses

data_power_new <- data_power_verified %>%
  mutate(effect_size_new = c(predictions_new_backtransformed$pred,
                             predictions_new_backtransformed$ci.lb,
                             predictions_new_backtransformed$pred,
                             predictions_new_backtransformed$ci.lb,
                             r_to_d(predictions_new_backtransformed$pred),
                             r_to_d(predictions_new_backtransformed$ci.lb),
                             r_to_d(predictions_new_backtransformed$pred),
                             r_to_d(predictions_new_backtransformed$ci.lb)),
         effect_size_new = round_half_up(effect_size_new, 2))

point_estimate_new <- predictions_new_backtransformed$pred
lower_ci_new <- predictions_new_backtransformed$ci.lb

data_power_new$required_n_new[1] <- 
  pwr.r.test(n = NULL, 
             r = point_estimate_new, 
             sig.level = 0.05, 
             power = 0.8, 
             alternative = "greater")$n %>%
  ceiling()

data_power_new$required_n_new[2] <- 
  pwr.r.test(n = NULL, 
             r = lower_ci_new, 
             sig.level = 0.05, 
             power = 0.8, 
             alternative = "greater")$n %>%
  ceiling()

data_power_new$required_n_new[3] <- 
  pwr.r.test(n = NULL, 
             r = point_estimate_new, 
             sig.level = 0.05, 
             power = 0.8, 
             alternative = "two.sided")$n %>%
  ceiling()

data_power_new$required_n_new[4] <- 
  pwr.r.test(n = NULL, 
             r = lower_ci_new, 
             sig.level = 0.05, 
             power = 0.8, 
             alternative = "two.sided")$n %>%
  ceiling()

data_power_new$required_n_new[5] <- 
  pwr.t.test(n = NULL, 
             d = r_to_d(point_estimate_new), 
             type = "two.sample",
             sig.level = 0.05, 
             power = 0.8, 
             alternative = "greater")$n %>%
  ceiling()*2

data_power_new$required_n_new[6] <- 
  pwr.t.test(n = NULL, 
             d = r_to_d(lower_ci_new), 
             type = "two.sample",
             sig.level = 0.05, 
             power = 0.8, 
             alternative = "greater")$n %>%
  ceiling()*2

data_power_new$required_n_new[7] <- 
  pwr.t.test(n = NULL, 
             d = r_to_d(point_estimate_new), 
             type = "paired",
             sig.level = 0.05, 
             power = 0.8, 
             alternative = "greater")$n %>%
  ceiling()

data_power_new$required_n_new[8] <- 
  pwr.t.test(n = NULL, 
             d = r_to_d(lower_ci_new), 
             type = "paired",
             sig.level = 0.05, 
             power = 0.8, 
             alternative = "greater")$n %>%
  ceiling()

# print
data_power_new %>%
  kable() %>%
  kable_classic(full_width = FALSE)
test tails estimate effect_size_vahey required_n_vahey required_n_verified effect_size_new required_n_new
Pearson’s r One-tailed Point estimate 0.45 29 29 0.22 126
Pearson’s r One-tailed Lower bound of 95% Confidence Interval 0.40 37 37 0.15 273
Pearson’s r Two-tailed Point estimate 0.45 36 36 0.22 160
Pearson’s r Two-tailed Lower bound of 95% Confidence Interval 0.40 NA 46 0.15 346
Independent t-test (Cohen’s d) One-tailed Point estimate 1.01 26 26 0.45 124
Independent t-test (Cohen’s d) One-tailed Lower bound of 95% Confidence Interval 0.87 36 34 0.30 270
Dependent t-test (Cohen’s d) One-tailed Point estimate 1.01 8 8 0.45 32
Dependent t-test (Cohen’s d) One-tailed Lower bound of 95% Confidence Interval 0.87 10 10 0.30 69
# compare power analysis sample sizes to the published literature
data_power_new %>%
  rowwise() %>%
  mutate(percent_of_publications_meeting_required_n_verified = janitor::round_half_up(mean(data_sample_sizes_vector > required_n_verified)*100, 1),
         percent_of_publications_meeting_required_n_new = janitor::round_half_up(mean(data_sample_sizes_vector > required_n_new)*100, 1)) %>%
  ungroup() %>%
  select(test, tails, 
         required_n_verified,
         percent_of_publications_meeting_required_n_verified,
         required_n_new,
         percent_of_publications_meeting_required_n_new) %>%
  kable() %>%
  kable_classic(full_width = FALSE)
test tails required_n_verified percent_of_publications_meeting_required_n_verified required_n_new percent_of_publications_meeting_required_n_new
Pearson’s r One-tailed 29 75.5 126 2.1
Pearson’s r One-tailed 37 54.3 273 0.0
Pearson’s r Two-tailed 36 55.9 160 1.1
Pearson’s r Two-tailed 46 39.4 346 0.0
Independent t-test (Cohen’s d) One-tailed 26 77.7 124 2.1
Independent t-test (Cohen’s d) One-tailed 34 59.0 270 0.0
Dependent t-test (Cohen’s d) One-tailed 8 99.5 32 62.2
Dependent t-test (Cohen’s d) One-tailed 10 98.9 69 11.7
data_sample_sizes |>
  distinct(Title, study_design_ignoring_trial_type_comparisons) |>
  count(study_design_ignoring_trial_type_comparisons) |>
  mutate(percent = janitor::round_half_up(n/sum(n)*100, 1)) |>
  kable() |>
  kable_classic(full_width = FALSE)
study_design_ignoring_trial_type_comparisons n percent
between 74 48.1
mixed 32 20.8
within 48 31.2